The data set

Read data

In smoking_status, the value "Unknown" should be changed to NA.

Also, it can be ordered: never < formerly < smokes

Other predictors seem to be OK

# Load the raw CSV; "Unknown" and "N/A" are both treated as missing.
# smoking_status is deliberately read as character ("c" in col_types): if it
# were declared a factor there, the na = c(...) substitution would not apply.
df <- read_csv("data/healthcare-dataset-stroke-data.csv", col_types = "cfdfffffddcf", na = c("Unknown", "N/A"))
df <- df %>%
  mutate(
    # Convert to factor and put the levels in the natural order
    # never < formerly < smokes.
    smoking_status = smoking_status %>%
      as_factor() %>%
      fct_relevel("never smoked", "formerly smoked", "smokes"),
    # Recode the 0/1 target to labelled levels, "no" first.
    stroke = factor(ifelse(stroke == 1, "yes", "no"), levels = c("no", "yes"))
  )

df

skimr description

Skip id column

# Drop the identifier column: it carries no predictive information.
df$id <- NULL
# Wide summary of every remaining column.
# NOTE(review): skim_to_wide() is deprecated in skimr >= 2.0 — confirm the
# installed version, or switch to skimr::skim(df).
skimr::skim_to_wide(df)
Data summary
Name Piped data
Number of rows 5110
Number of columns 11
_______________________
Column type frequency:
factor 8
numeric 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
gender 0 1.0 FALSE 3 Fem: 2994, Mal: 2115, Oth: 1
hypertension 0 1.0 FALSE 2 0: 4612, 1: 498
heart_disease 0 1.0 FALSE 2 0: 4834, 1: 276
ever_married 0 1.0 FALSE 2 Yes: 3353, No: 1757
work_type 0 1.0 FALSE 5 Pri: 2925, Sel: 819, chi: 687, Gov: 657
Residence_type 0 1.0 FALSE 2 Urb: 2596, Rur: 2514
smoking_status 1544 0.7 FALSE 3 nev: 1892, for: 885, smo: 789
stroke 0 1.0 FALSE 2 no: 4861, yes: 249

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1.00 43.23 22.61 0.08 25.00 45.00 61.00 82.00 ▅▆▇▇▆
avg_glucose_level 0 1.00 106.15 45.28 55.12 77.24 91.88 114.09 271.74 ▇▃▁▁▁
bmi 201 0.96 28.89 7.85 10.30 23.50 28.10 33.10 97.60 ▇▇▁▁▁

Target ‘stroke’ is imbalanced!

Smoking’s complete rate 0.7

BMI’s complete rate 0.96

One ‘Other’ gender to be removed

# Only a single "Other" observation — too rare to model, so drop the row.
# fct_drop() also removes the now-empty "Other" level; without it, the
# dummyVars() step below would emit an all-zero gender.Other indicator column.
df <- df %>%
  filter(gender != "Other") %>%
  mutate(gender = fct_drop(gender))

EDA

Pairs plot

# Pairwise overview of all predictors, colored by the stroke outcome;
# correlations in the upper triangle, bar charts on the diagonal.
# NOTE(review): alpha and dotsize are constants mapped inside aes() — GGally
# forwards the mapping to its sub-plots, but constants normally belong outside
# aes(); confirm the rendered plot looks as intended before "fixing" this.
GGally::ggpairs(df, aes(color = stroke, alpha = 0.2, dotsize = 0.02), 
        upper = list(continuous = GGally::wrap("cor", size = 2.5)),
        diag = list(continuous = "barDiag")) +
  scale_color_brewer(palette = "Set1", direction = -1) +
  scale_fill_brewer(palette = "Set1", direction = -1)

Detailed

Stroke vs Gender

# Count observations per (stroke, gender) cell.
# .groups = "drop" returns an ungrouped result and silences the
# summarize() regrouping message.
gender <- df %>%
  group_by(stroke, gender) %>%
  summarize(N = n(), .groups = "drop")

# Stacked proportions of gender within each stroke group.
# geom_col() is the idiomatic form of geom_bar(stat = "identity").
ggplot(gender, aes(stroke, N)) +
  geom_col(aes(fill = gender), alpha = 0.8, position = "fill") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  ylab("proportion")

Proportions in both stroke groups are roughly the same

Stroke vs Age

# Age distribution by stroke status: violins for shape, jittered points for
# the raw observations (colored by gender), and boxplots for the quartiles.
ggplot(df, aes(stroke, age)) +
  geom_violin(alpha = 0.3) +
  geom_jitter(aes(color = gender),
              alpha = 0.2, size = 0.8, width = 0.15, height = 0.1) +
  geom_boxplot(alpha = 0.2) +
  scale_color_brewer(palette = "Set2", direction = -1)

On average, age is higher in the stroke-yes group; there appears to be no interaction between age and gender.

Stroke vs Hypertension

# Count observations per (stroke, hypertension) cell.
# .groups = "drop" returns an ungrouped result and silences the
# summarize() regrouping message.
hyptens <- df %>%
  group_by(stroke, hypertension) %>%
  summarize(N = n(), .groups = "drop")

hyptens
# Stacked proportions of hypertension within each stroke group.
# geom_col() is the idiomatic form of geom_bar(stat = "identity").
ggplot(hyptens, aes(stroke, N)) +
  geom_col(aes(fill = hypertension), alpha = 0.8, position = "fill") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  ylab("proportion")

Hypertension occurs more often in the stroke-yes group.

Scaling & Normalization

Scale numeric features

# Center and scale the three numeric predictors (z-scores), returned as a
# plain data.frame so it can be column-bound with the dummies later.
df_scaled <- data.frame(scale(select(df, avg_glucose_level, age, bmi)))

Make Dummies

I’ve decided to omit smoking_status completely - it won’t be dummified

# One-hot encode the nominal predictors with caret.
# Pick the columns to be expanded...
to_dum <- df %>% select(gender, ever_married, work_type, Residence_type)
# ...build the encoding specification from them...
dummies <- dummyVars(~ ., data = to_dum)
# ...and apply it, converting the indicator matrix back to a data.frame.
df_dummy <- predict(dummies, newdata = to_dum) %>% data.frame()

head(df_dummy)

Join scaled and dummies and the rest

# Assemble the modelling frame: scaled numerics, dummy indicators, and the
# binary factors that need no further processing (plus the target).
df_proc <- bind_cols(
  df_scaled,
  df_dummy,
  df %>% select(hypertension, heart_disease, stroke)
)
df_proc

Imputation

I will use median imputation for BMI.

Smoking status will not be processed.

# Median-impute the missing bmi values (complete rate was 0.96).
# coalesce() keeps bmi where present and falls back to the median otherwise —
# equivalent to ifelse(is.na(bmi), median(bmi, na.rm = TRUE), bmi).
df_proc <- df_proc %>%
  mutate(bmi = coalesce(bmi, median(bmi, na.rm = TRUE)))

Modelling

Params tuning

ROC-optimization is better when data is imbalanced

# Resampling scheme for ROC-optimized tuning: 5-fold CV repeated 10 times.
# (The original comment claimed "10-fold CV", but number = 5 makes it 5-fold.)
# classProbs + twoClassSummary are required for caret to report ROC/Sens/Spec.
fit_ctrl_xgb_roc <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 10,
  allowParallel = TRUE,  # spell out TRUE — T is an ordinary, reassignable name
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Same 5-fold x 10 scheme with the default summary (Accuracy/Kappa),
# used for the Kappa-optimized fits.
fit_ctrl_xgb_kappa <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 10,
  allowParallel = TRUE
)

Split data

Imbalanced data - use SMOTE to create training data set, but not testing data set

set.seed(1234)
# Stratified 75/25 train/test split on the outcome.
sample_set <- createDataPartition(y = df_proc$stroke, p = .75, list = FALSE)
df_train <- df_proc[sample_set,]
df_test <- df_proc[-sample_set,]

# DMwR::SMOTE for imbalanced data: over=225 and under=150 give me 1:1 ratio
# NOTE(review): df_train_smote is built here, but the train() calls below all
# use df_train — confirm whether the models were meant to be fitted on the
# SMOTE-balanced set instead.
df_train_smote <- SMOTE(stroke ~ ., data.frame(df_train), perc.over = 225, perc.under = 150)

# Class counts after resampling — should be roughly 1:1.
df_train_smote %>% group_by(stroke) %>% summarise(N=n())

RF

Training and validation

ROC-optimized

set.seed(122)

# Random forest tuned over 10 mtry values, selected by mean CV ROC.
# NOTE(review): this fits on df_train, not the SMOTE-balanced df_train_smote
# built earlier — confirm which training set was intended.
fit_rf_roc <- train(
  stroke ~ .,
  data = df_train,
  method = "rf",
  metric = "ROC",
  trControl = fit_ctrl_xgb_roc,
  tuneLength = 10,
  verbose = FALSE,
  verbosity = 0
)

fit_rf_roc
## Random Forest 
## 
## 3832 samples
##   17 predictor
##    2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 3066, 3065, 3065, 3066, 3066, 3066, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec       
##    2    0.7732195  1.0000000  0.000000000
##    3    0.7907778  1.0000000  0.000000000
##    5    0.7941623  0.9990123  0.003186344
##    7    0.7970675  0.9976955  0.010156472
##    8    0.7995754  0.9971742  0.012830725
##   10    0.7977758  0.9964609  0.017140825
##   12    0.7998823  0.9965706  0.025689900
##   13    0.7999289  0.9961866  0.026216216
##   15    0.8015626  0.9958025  0.027866287
##   17    0.8006381  0.9950343  0.029985775
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 15.

Kappa-optimized

set.seed(123)

# Same random forest, default tuning grid, but the winning mtry is chosen
# by Kappa, which accounts for the imbalanced accuracy baseline.
fit_rf_acc <- train(
  stroke ~ .,
  data = df_train,
  method = "rf",
  metric = "Kappa",
  trControl = fit_ctrl_xgb_kappa,
  verbose = FALSE,
  verbosity = 0
)

fit_rf_acc
## Random Forest 
## 
## 3832 samples
##   17 predictor
##    2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 3066, 3065, 3066, 3066, 3065, 3065, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa     
##    2    0.9512008  0.00000000
##    9    0.9491650  0.01928477
##   17    0.9481212  0.03618828
## 
## Kappa was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 17.

Features importance

# Variable importance from the ROC-optimized forest.
imp_vars_rf <- varImp(fit_rf_roc)

plot(imp_vars_rf, main="Variable Importance with RF")

Testing

ROC-optimized model

# Hard-class predictions from the ROC-optimized forest on the held-out set.
predClasses_rf_roc <- predict(fit_rf_roc, newdata = df_test)

# Full confusion-matrix report, with "yes" (stroke) as the positive class.
cm_rf_roc <- confusionMatrix(
  data = predClasses_rf_roc,
  reference = df_test$stroke,
  positive = "yes",
  mode = "everything"
)

cm_rf_roc
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  958  16
##        yes 257  46
##                                           
##                Accuracy : 0.7862          
##                  95% CI : (0.7627, 0.8084)
##     No Information Rate : 0.9514          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1865          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.74194         
##             Specificity : 0.78848         
##          Pos Pred Value : 0.15182         
##          Neg Pred Value : 0.98357         
##               Precision : 0.15182         
##                  Recall : 0.74194         
##                      F1 : 0.25205         
##              Prevalence : 0.04855         
##          Detection Rate : 0.03602         
##    Detection Prevalence : 0.23727         
##       Balanced Accuracy : 0.76521         
##                                           
##        'Positive' Class : yes             
## 

XGB

Training and validation

ROC optimized

set.seed(123)

# DART-boosted xgboost, default tuning grid, selected by mean CV ROC.
# verbosity = 0 silences xgboost's per-iteration chatter.
fit_xgb_roc <- train(
  stroke ~ .,
  data = df_train,
  method = "xgbDART",
  metric = "ROC",
  trControl = fit_ctrl_xgb_roc,
  verbose = FALSE,
  verbosity = 0
)

fit_xgb_roc$bestTune

Show metrics

# Resampling metrics for the winning hyper-parameter combination.
# Joining on bestTune picks the best row directly instead of hand-copying
# the eight tuned values into a filter() call (which silently breaks if
# the grid or seed changes).
fit_xgb_roc$results %>%
  semi_join(fit_xgb_roc$bestTune, by = names(fit_xgb_roc$bestTune))

Features importance

# Variable importance as reported by the tuned xgbDART model.
imp_vars_xgb <- varImp(fit_xgb_roc)

plot(imp_vars_xgb, main="Variable Importance with XGB")

Testing

# Hard-class predictions from the ROC-optimized XGB model on the held-out set.
predClasses_xgb_roc <- predict(fit_xgb_roc, newdata = df_test)

# Full confusion-matrix report, with "yes" (stroke) as the positive class.
cm_xgb_roc <- confusionMatrix(
  data = predClasses_xgb_roc,
  reference = df_test$stroke,
  positive = "yes",
  mode = "everything"
)

cm_xgb_roc
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  921  16
##        yes 294  46
##                                           
##                Accuracy : 0.7572          
##                  95% CI : (0.7328, 0.7805)
##     No Information Rate : 0.9514          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1599          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.74194         
##             Specificity : 0.75802         
##          Pos Pred Value : 0.13529         
##          Neg Pred Value : 0.98292         
##               Precision : 0.13529         
##                  Recall : 0.74194         
##                      F1 : 0.22886         
##              Prevalence : 0.04855         
##          Detection Rate : 0.03602         
##    Detection Prevalence : 0.26625         
##       Balanced Accuracy : 0.74998         
##                                           
##        'Positive' Class : yes             
## 

ROC & AUC

# Build a ROCR ROC curve for a fitted caret model on a test data frame.
# Returns a two-element list: [[1]] the tpr-vs-fpr performance object,
# [[2]] the raw prediction object (needed later for the AUC).
get_roc <- function(fit.obj, testing.df){
  prob_yes <- predict.train(fit.obj, newdata = testing.df, type = "prob")$yes
  roc_pred <- prediction(predictions = prob_yes, labels = testing.df$stroke)
  roc_curve <- performance(roc_pred, measure = "tpr", x.measure = "fpr")
  list(roc_curve, roc_pred)
}

RF

# calculate ROC
perf_pred <- get_roc(fit_rf_roc, df_test)
perf_roc_rf <- perf_pred[[1]]
pred_roc_rf <- perf_pred[[2]]

# take AUC 
auc_rf <- round(unlist(slot(performance(pred_roc_rf, measure = "auc"), "y.values")), 3)

# plot
plot(perf_roc_rf, main = "RF ROC curve", col = "steelblue", lwd = 3)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
legend(x = 0.7, y = 0.3, legend = paste0("AUC = ", auc_rf))

XGB

# calculate ROC
perf_pred <- get_roc(fit_xgb_roc, df_test)
perf_roc_xgb <- perf_pred[[1]]
pred_roc_xgb <- perf_pred[[2]]


# take AUC 
auc_xgb <- round(unlist(slot(performance(pred_roc_xgb, measure = "auc"), "y.values")), 3)

# plot
plot(perf_roc_xgb, main = "XGB ROC curve", col = "steelblue", lwd = 3)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
legend(x = 0.7, y = 0.3, legend = paste0("AUC = ", auc_xgb))


# Persist the full workspace (data splits, fitted models, summaries) so the
# analysis can be resumed without re-running the expensive training steps.
save.image("workspace.RData")